Observed log-chlorophyll at representative station in SF Bay Delta region.

library(tidyverse)
library(lubridate)
library(mgcv)  
library(plotly)
library(WRTDStidal)
library(gridExtra)
source('R/funcs.R')

# flow data, left moving window average of 30 days
data(flow_dat)
fl_dat <- flow_dat %>% 
  rename(date = Date) %>% 
  filter(station %in% 'sjr') %>% 
  mutate(
    qsm = stats::filter(q, rep(1, 30)/30, sides = 1, method = 'convolution')
  )
  
# format the data to model
data(sf_dat)
sf_sub <- sf_dat %>% 
  filter(Site_Code %in% 'P8') %>% 
  rename(date = Date) %>% 
  mutate(
    doy = yday(date), 
    dec_time = decimal_date(date), 
    yr = year(date),
    mo = month(date, label = T)
  ) %>% 
  left_join(fl_dat, by = 'date') %>% 
  mutate(
    flo = log(qsm), 
    lnchl = log(chl)
    ) %>% 
  select(-q, -qsm, -station, -Latitude, -Longitude, -Location)

# plot, all
p <- ggplot(sf_sub, aes(x = date, y = lnchl)) + 
  geom_line() +
  theme_bw() 
ggplotly(p)
# boxplot, by years
p <- ggplot(sf_sub, aes(x = yr, y = lnchl)) + 
  geom_boxplot() + 
  theme_bw()
ggplotly(p)
# boxplot, by month
p <- ggplot(sf_sub, aes(x = mo, y = lnchl)) + 
  geom_boxplot() + 
  theme_bw()
ggplotly(p)

Some simple GAMs to explore annual, seasonal trends.

# annual only
mod1 <- gam(lnchl ~ s(dec_time, bs = 'tp'),
  data = sf_sub, 
  na.action = na.exclude
  )

# seasonal only
mod2 <- gam(lnchl ~ s(doy, bs = 'cc'), 
  knots = list(doy = c(1, 366)),
  data = sf_sub, 
  na.action = na.exclude
  )
 
# annual and seasonal, additive
mod3 <- gam(lnchl ~ s(dec_time, bs = 'tp') + 
  s(doy, bs = 'cc'), 
  knots = list(doy = c(1, 366)),
  data = sf_sub, 
  na.action = na.exclude
  )

# annual and seasonal, interaction
mod4 <- gam(lnchl ~ te(dec_time, doy, bs = c('tp', 'cc')),
  knots = list(doy = c(1, 366)),
  data = sf_sub, 
  na.action = na.exclude
  )

Summary stats of annual, seasonal models:

mods <- list(
  mod1 = mod1,
  mod2 = mod2, 
  mod3 = mod3,
  mod4 = mod4
  )

# smoother stats of GAMs
map(mods, ~ summary(.x)$s.table %>% data.frame %>% rownames_to_column('smoother')) %>% 
  enframe %>% 
  unnest %>% 
  kable
name smoother edf Ref.df F p.value
mod1 s(dec_time) 8.212548 8.830184 32.55938 0
mod2 s(doy) 7.083832 8.000000 16.06665 0
mod3 s(dec_time) 8.394141 8.896837 41.39759 0
mod3 s(doy) 7.393221 8.000000 24.13289 0
mod4 te(dec_time,doy) 16.859549 18.456279 25.60994 0
# summary stats of GAMs
map(mods, ~ data.frame(
    AIC = AIC(.x), 
    R2 = summary(.x)$r.sq)) %>% 
  enframe %>% 
  unnest %>% 
  kable
name AIC R2
mod1 1188.287 0.3406925
mod2 1304.711 0.1845578
mod3 1030.524 0.5109149
mod4 1083.738 0.4625216
# prediction data
pred_dat <- data.frame(
    dec_time = seq(min(sf_sub$dec_time), max(sf_sub$dec_time), length = 1000)
  ) %>% 
  mutate(
    date = date_decimal(dec_time), 
    date = as.Date(date),
    mo = month(date, label = TRUE), 
    doy = yday(date), 
    yr = year(date)
  ) %>% 
  left_join(., fl_dat[, c('date', 'qsm')]) %>% 
  mutate(flo = log(qsm)) %>% 
  select(-qsm)

# predictions
sf_res <- pred_dat %>% 
  mutate(
    mod1 = predict(mod1, newdata = pred_dat), 
    mod2 = predict(mod2, newdata = pred_dat), 
    mod3 = predict(mod3, newdata = pred_dat), 
    mod4 = predict(mod4, newdata = pred_dat)
  ) %>% 
  tidyr::gather('mods', 'pred', -date, -mo, -doy, -dec_time, -yr, -flo)

# plot
p <- ggplot(sf_res, aes(x = date)) + 
  geom_point(data = sf_sub, aes(y = lnchl), size = 0.5) + 
  geom_line(aes(y = pred, colour = mods)) + 
  theme_bw() + 
  theme(
    legend.position = 'top', 
    legend.title = element_blank()
    )
ggplotly(p)
# plot
p <- ggplot(sf_res, aes(x = doy, group = factor(yr), colour = yr)) + 
  geom_line(aes(y = pred)) + 
  theme_bw() + 
  theme(
    legend.position = 'top', 
    legend.title = element_blank()
    ) + 
  facet_wrap(~ mods, ncol = 2)
ggplotly(p)

Adding flow data to the model:

# model, all terms additive
mod5 <- gam(lnchl ~ s(dec_time, bs = 'tp') + s(doy, bs = 'cc') + s(flo, bs = 'tp'),
  knots = list(doy = c(1, 366)),
  data = sf_sub,
  na.action = na.exclude
  )

# model, all terms additive
mod6 <- gam(lnchl ~ te(dec_time, doy, bs = c('tp', 'cc')) + s(flo, bs = 'tp'),
  knots = list(doy = c(1, 366)),
  data = sf_sub,
  na.action = na.exclude
  )

# model, all terms additive
mod7 <- gam(lnchl ~ te(dec_time, flo, bs = c('tp', 'tp')) + s(doy, bs = 'cc'),
  knots = list(doy = c(1, 366)),
  data = sf_sub,
  na.action = na.exclude
  )

# model, all terms additive
mod8 <- gam(lnchl ~ te(flo, doy, bs = c('tp', 'cc')) + s(dec_time, bs = 'tp'),
  knots = list(doy = c(1, 366)),
  data = sf_sub,
  na.action = na.exclude
  )

# model, all terms additive
mod9 <- gam(lnchl ~ te(flo, doy, bs = c('tp', 'cc')) + te(flo, dec_time, bs = c('tp', 'tp')),
  knots = list(doy = c(1, 366)),
  data = sf_sub,
  na.action = na.exclude
  )

# model using a tensor product smooth for interactions
mod10 <- gam(lnchl ~ te(dec_time, doy, flo, bs = c('tp', 'cc', 'tp')),
  knots = list(doy = c(1, 366)),
  data = sf_sub,
  na.action = na.exclude
  )

Summary stats of annual, seasonal, flow models:

mods2 <- list(
  mod5 = mod5,
  mod6 = mod6, 
  mod7 = mod7,
  mod8 = mod8,
  mod9 = mod9, 
  mod10 = mod10
  )

# smoother stats of GAMs
map(mods2, ~ summary(.x)$s.table %>% data.frame %>% rownames_to_column('smoother')) %>% 
  enframe %>% 
  unnest %>% 
  kable
name smoother edf Ref.df F p.value
mod5 s(dec_time) 8.443642 8.910357 41.055072 0.0000000
mod5 s(doy) 7.410168 8.000000 23.576338 0.0000000
mod5 s(flo) 4.927544 6.039680 2.104444 0.0484013
mod6 te(dec_time,doy) 16.848322 18.443527 24.487013 0.0000000
mod6 s(flo) 4.994615 6.104263 1.428754 0.1893507
mod7 te(dec_time,flo) 17.179510 18.932122 17.608032 0.0000000
mod7 s(doy) 7.370335 8.000000 23.899560 0.0000000
mod8 te(flo,doy) 12.456261 14.898789 13.692255 0.0000000
mod8 s(dec_time) 8.546913 8.935920 41.075520 0.0000000
mod9 te(flo,doy) 12.868088 15.106326 12.703471 0.0000000
mod9 te(flo,dec_time) 13.179584 20.000000 14.760514 0.0000000
mod10 te(dec_time,doy,flo) 42.876219 53.581394 10.425867 0.0000000
# summary stats of GAMs
map(mods2, ~ data.frame(
    AIC = AIC(.x), 
    R2 = summary(.x)$r.sq)) %>% 
  enframe %>% 
  unnest %>% 
  kable
name AIC R2
mod5 1023.706 0.5210944
mod6 1081.051 0.4697431
mod7 1060.274 0.4916756
mod8 1022.771 0.5220878
mod9 1059.692 0.4935151
mod10 1062.573 0.5050973
# predictions
sf_res2 <- pred_dat %>% 
  mutate(
    mod5 = predict(mod5, newdata = pred_dat), 
    mod6 = predict(mod6, newdata = pred_dat), 
    mod7 = predict(mod7, newdata = pred_dat), 
    mod8 = predict(mod8, newdata = pred_dat),
    mod9 = predict(mod9, newdata = pred_dat),
    mod10 = predict(mod10, newdata = pred_dat)
  ) %>% 
  tidyr::gather('mods', 'pred', -date, -mo, -doy, -dec_time, -yr, -flo)

# plot
p <- ggplot(sf_res2, aes(x = date)) + 
  geom_point(data = sf_sub, aes(y = lnchl), size = 0.5) + 
  geom_line(aes(y = pred, colour = mods)) + 
  theme_bw() + 
  theme(
    legend.position = 'top', 
    legend.title = element_blank()
    )
ggplotly(p)
ptheme <- theme(
  axis.title.x = element_blank(), 
  axis.title.y = element_blank()
)
cols <- 'Spectral'
p5 <- dynagam(mod5, pred_dat, ncol = 1, col_vec = cols) + 
  ptheme + 
  theme(legend.position = 'none') +
  ggtitle('mod5')
p6 <- dynagam(mod6, pred_dat, ncol = 1, col_vec = cols) + 
  ptheme + 
  theme(legend.position = 'none') + 
  ggtitle('mod6')
p7 <- dynagam(mod7, pred_dat, ncol = 1, col_vec = cols) + 
  ptheme + 
  theme(legend.position = 'none') + 
  ggtitle('mod7')
p8 <- dynagam(mod8, pred_dat, ncol = 1, col_vec = cols) + 
  ptheme + 
  theme(legend.position = 'none') + 
  ggtitle('mod8')
p9 <- dynagam(mod9, pred_dat, ncol = 1, col_vec = cols) + 
  ptheme + 
  theme(legend.position = 'none') + 
  ggtitle('mod9')
p10 <- dynagam(mod10, pred_dat, ncol = 1, col_vec = cols) + 
  ptheme + 
  ggtitle('mod10')
pleg <- g_legend(p10)
p10 <- p10 + 
  theme(legend.position = 'none')

grid.arrange(
  pleg, 
  arrangeGrob(p5, p6, p7, p8, p9, p10, ncol = 6, bottom = 'lnQ', left = 'lnchl'), 
  ncol = 1, 
  heights = c(0.1, 1)
)

Backwards model selection, see here:

mod <- gam(lnchl ~ s(dec_time, bs = 'tp') + 
              s(doy, bs = 'cc') + 
              s(flo, bs = 'tp') +
              te(flo, doy, bs = c('tp', 'cc')) + 
              te(flo, dec_time, bs = c('tp', 'tp')) +
              te(dec_time, doy, bs = c('tp', 'cc')) +
              te(dec_time, doy, flo, bs = c('tp', 'cc', 'tp')),
            knots = list(doy = c(1, 366)),
            data = sf_sub,
            na.action = na.exclude
)
summary(mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## lnchl ~ s(dec_time, bs = "tp") + s(doy, bs = "cc") + s(flo, bs = "tp") + 
##     te(flo, doy, bs = c("tp", "cc")) + te(flo, dec_time, bs = c("tp", 
##     "tp")) + te(dec_time, doy, bs = c("tp", "cc")) + te(dec_time, 
##     doy, flo, bs = c("tp", "cc", "tp"))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.61702    0.02385   67.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F  p-value    
## s(dec_time)           8.1368   8.73 8.167 3.83e-11 ***
## s(doy)                6.6482   8.00 3.404 4.93e-05 ***
## s(flo)                4.8183   5.85 1.352  0.30477    
## te(flo,doy)          15.0000  15.00 2.191 2.80e-05 ***
## te(flo,dec_time)      6.0911  16.00 0.716  0.00603 ** 
## te(dec_time,doy)      0.8695  15.00 0.051  0.18557    
## te(dec_time,doy,flo) 11.6894  48.00 0.488  0.00111 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.578   Deviance explained = 61.9%
## GCV = 0.34885  Scale est. = 0.31463   n = 553
AIC(mod)
## [1] 983.2766
mod <- gam(lnchl ~ s(dec_time, bs = 'tp') + 
             s(doy, bs = 'cc') + 
             te(flo, doy, bs = c('tp', 'cc')) + 
             te(flo, dec_time, bs = c('tp', 'tp')) +
             te(dec_time, doy, bs = c('tp', 'cc')) +
             te(dec_time, doy, flo, bs = c('tp', 'cc', 'tp')),
           knots = list(doy = c(1, 366)),
           data = sf_sub,
           na.action = na.exclude
)
summary(mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## lnchl ~ s(dec_time, bs = "tp") + s(doy, bs = "cc") + te(flo, 
##     doy, bs = c("tp", "cc")) + te(flo, dec_time, bs = c("tp", 
##     "tp")) + te(dec_time, doy, bs = c("tp", "cc")) + te(dec_time, 
##     doy, flo, bs = c("tp", "cc", "tp"))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.6170     0.0243   66.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F  p-value    
## s(dec_time)           8.289  8.806 9.208 1.02e-12 ***
## s(doy)                6.929  8.000 2.719 0.000229 ***
## te(flo,doy)          11.528 13.168 1.543 0.122932    
## te(flo,dec_time)      7.804 16.000 0.967 0.003855 ** 
## te(dec_time,doy)      3.258 12.000 0.589 0.006213 ** 
## te(dec_time,doy,flo)  3.925 48.000 0.087 0.146334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.562   Deviance explained = 59.6%
## GCV = 0.35391  Scale est. = 0.32656   n = 553
AIC(mod)
## [1] 993.4513
mod <- gam(lnchl ~ s(dec_time, bs = 'tp') + 
             s(doy, bs = 'cc') + 
             te(flo, doy, bs = c('tp', 'cc')) + 
             te(flo, dec_time, bs = c('tp', 'tp')) +
             te(dec_time, doy, bs = c('tp', 'cc')),
           knots = list(doy = c(1, 366)),
           data = sf_sub,
           na.action = na.exclude
)
summary(mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## lnchl ~ s(dec_time, bs = "tp") + s(doy, bs = "cc") + te(flo, 
##     doy, bs = c("tp", "cc")) + te(flo, dec_time, bs = c("tp", 
##     "tp")) + te(dec_time, doy, bs = c("tp", "cc"))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.61702    0.02439   66.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df     F  p-value    
## s(dec_time)       8.263   8.80 9.372 5.75e-13 ***
## s(doy)            6.986   8.00 2.738 0.000219 ***
## te(flo,doy)      11.493  13.19 1.610 0.080277 .  
## te(flo,dec_time)  7.693  16.00 0.994 0.008297 ** 
## te(dec_time,doy)  3.667  12.00 0.881 0.005124 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.559   Deviance explained = 58.9%
## GCV = 0.35411  Scale est. = 0.32907   n = 553
AIC(mod)
## [1] 994.3408
mod <- gam(lnchl ~ s(dec_time, bs = 'tp') + 
             s(doy, bs = 'cc') + 
             te(flo, dec_time, bs = c('tp', 'tp')) +
             te(dec_time, doy, bs = c('tp', 'cc')),
           knots = list(doy = c(1, 366)),
           data = sf_sub,
           na.action = na.exclude
)
summary(mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## lnchl ~ s(dec_time, bs = "tp") + s(doy, bs = "cc") + te(flo, 
##     dec_time, bs = c("tp", "tp")) + te(dec_time, doy, bs = c("tp", 
##     "cc"))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.61702    0.02505   64.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df      F  p-value    
## s(dec_time)      8.305   8.81 11.151 6.61e-16 ***
## s(doy)           7.364   8.00 12.193  < 2e-16 ***
## te(flo,dec_time) 6.293  20.00  0.773  0.00529 ** 
## te(dec_time,doy) 4.248  15.00  0.824  0.00324 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.535   Deviance explained = 55.7%
## GCV = 0.36508  Scale est. = 0.34712   n = 553
AIC(mod)
## [1] 1012.738